In [ ]:
import plotly.graph_objects as go
import plotly.express as px
import pandas as pd
import dash as dash
# This ensures Plotly output works in multiple places:
# plotly_mimetype: VS Code notebook UI
# notebook: "Jupyter: Export to HTML" command in VS Code
# See https://plotly.com/python/renderers/#multiple-renderers
# CREDIT FROM congyiwu from https://github.com/microsoft/vscode-jupyter/issues/6999
import plotly.io as pio
pio.renderers.default = "plotly_mimetype+notebook"
In [ ]:
data = pd.read_csv('VideoGamesSales.csv')
# read the data
In [ ]:
# Statistics yeah
# now we analyse the data a bit
# It was just for my own discovery
print(data.info())
#16719 entries, 16 columns
print(data.describe())
# statistics of the data, min max is what I am mostly interested from here
print(data.head())
# some NaN values in the data
# Year, User count and Critic data is float, I'll change them to int
data['Year_of_Release'] = data['Year_of_Release'].astype('Int64')
data['User_Count'] = data['User_Count'].astype('Int64')
data['Critic_Count'] = data['Critic_Count'].astype('Int64')
pd.set_option('display.max_rows', None)
data.isnull().sum()
# a lot of NaN values in the data in critic ratings, user ratings
# whatever are Developer and Rating
# probably the publisher and developer are the same at the ones that the developer is missing
count = 0
for index in data.index:
count = count + data['JP_Sales'][index] + data['EU_Sales'][index] + data['NA_Sales'][index] + data['Other_Sales'][index]
print(count)
print(data['Global_Sales'].sum())
# 1235 Publisher == Developer
# 15484 Publisher != Developer with NaNs
# data.dropna(subset=['Critic_Score', 'Publisher'], inplace=True)
sum = 0
for index in data.index:
if data['Publisher'][index]=='Nintendo':
sum = sum + data['Global_Sales'][index]
print("Total Global Sales Nintendo:", sum)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16719 entries, 0 to 16718
Data columns (total 16 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Name 16717 non-null object
1 Platform 16719 non-null object
2 Year_of_Release 16450 non-null float64
3 Genre 16717 non-null object
4 Publisher 16665 non-null object
5 NA_Sales 16719 non-null float64
6 EU_Sales 16719 non-null float64
7 JP_Sales 16719 non-null float64
8 Other_Sales 16719 non-null float64
9 Global_Sales 16719 non-null float64
10 Critic_Score 8137 non-null float64
11 Critic_Count 8137 non-null float64
12 User_Score 7590 non-null float64
13 User_Count 7590 non-null float64
14 Developer 10096 non-null object
15 Rating 9950 non-null object
dtypes: float64(10), object(6)
memory usage: 2.0+ MB
None
Year_of_Release NA_Sales EU_Sales JP_Sales \
count 16450.000000 16719.000000 16719.000000 16719.000000
mean 2006.487356 0.263330 0.145025 0.077602
std 5.878995 0.813514 0.503283 0.308818
min 1980.000000 0.000000 0.000000 0.000000
25% 2003.000000 0.000000 0.000000 0.000000
50% 2007.000000 0.080000 0.020000 0.000000
75% 2010.000000 0.240000 0.110000 0.040000
max 2020.000000 41.360000 28.960000 10.220000
Other_Sales Global_Sales Critic_Score Critic_Count User_Score \
count 16719.000000 16719.000000 8137.000000 8137.000000 7590.000000
mean 0.047332 0.533543 68.967679 26.360821 7.125046
std 0.186710 1.547935 13.938165 18.980495 1.500006
min 0.000000 0.010000 13.000000 3.000000 0.000000
25% 0.000000 0.060000 60.000000 12.000000 6.400000
50% 0.010000 0.170000 71.000000 21.000000 7.500000
75% 0.030000 0.470000 79.000000 36.000000 8.200000
max 10.570000 82.530000 98.000000 113.000000 9.700000
User_Count
count 7590.000000
mean 162.229908
std 561.282326
min 4.000000
25% 10.000000
50% 24.000000
75% 81.000000
max 10665.000000
Name Platform Year_of_Release Genre Publisher \
0 Wii Sports Wii 2006.0 Sports Nintendo
1 Super Mario Bros. NES 1985.0 Platform Nintendo
2 Mario Kart Wii Wii 2008.0 Racing Nintendo
3 Wii Sports Resort Wii 2009.0 Sports Nintendo
4 Pokemon Red/Pokemon Blue GB 1996.0 Role-Playing Nintendo
NA_Sales EU_Sales JP_Sales Other_Sales Global_Sales Critic_Score \
0 41.36 28.96 3.77 8.45 82.53 76.0
1 29.08 3.58 6.81 0.77 40.24 NaN
2 15.68 12.76 3.79 3.29 35.52 82.0
3 15.61 10.93 3.28 2.95 32.77 80.0
4 11.27 8.89 10.22 1.00 31.37 NaN
Critic_Count User_Score User_Count Developer Rating
0 51.0 8.0 322.0 Nintendo E
1 NaN NaN NaN NaN NaN
2 73.0 8.3 709.0 Nintendo E
3 73.0 8.0 192.0 Nintendo E
4 NaN NaN NaN NaN NaN
8916.060000004225
8920.300000000001
Total Global Sales Nintendo: 1788.8099999999977
In [ ]:
data.isnull().sum()
Out[ ]:
Name 2 Platform 0 Year_of_Release 269 Genre 2 Publisher 54 NA_Sales 0 EU_Sales 0 JP_Sales 0 Other_Sales 0 Global_Sales 0 Critic_Score 8582 Critic_Count 8582 User_Score 9129 User_Count 9129 Developer 6623 Rating 6769 dtype: int64
In [ ]:
total_sales = data[['NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales']].sum()
fig = go.Figure(data=go.Bar(
x=total_sales.index,
y=total_sales.values,
marker_color=['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728'],
))
fig.update_layout(
title='Video Game Sales Distribution by Region',
xaxis=dict(title='Region'),
yaxis=dict(title='Total Sales (in millions)'),
)
fig.show()
In [ ]:
import seaborn as sns
import matplotlib.pyplot as plt
numerical_columns = ['Year_of_Release', 'NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales', 'Critic_Score', 'Critic_Count', 'User_Score', 'User_Count']
sns.pairplot(data[numerical_columns])
plt.show()
In [ ]:
yearly_counts = data['Year_of_Release'].value_counts().sort_index().reset_index()
yearly_counts.columns = ['Year', 'Number of Games Released']
fig = px.line(yearly_counts, x='Year', y='Number of Games Released', title='Total Number of Games Released Each Year (1980 - 2020)', markers=True)
fig.update_layout(xaxis_title='Year of Release', yaxis_title='Number of Games Released')
fig.update_layout(width=1000, height=600)
fig.show()
In [ ]:
yearly_counts = data['Year_of_Release'].value_counts().sort_index().reset_index()
yearly_counts.columns = ['Year', 'Number of Games Released']
fig = px.bar(yearly_counts, x='Year', y='Number of Games Released', title='Total Number of Games Released Each Year (1980 - 2020)')
fig.update_layout(xaxis_title='Year of Release', yaxis_title='Number of Games Released')
fig.update_layout(width=1000, height=600)
fig.show()
In [ ]:
count_2017 = 0
count_2018 = 0
count_2019 = 0
count_2020 = 0
for index in data.index:
if pd.Series(data['Year_of_Release'][index]).notnull().bool():
if data['Year_of_Release'][index] == 2017:
count_2017 += 1
elif data['Year_of_Release'][index] == 2018:
count_2018 += 1
elif data['Year_of_Release'][index] == 2019:
count_2019 += 1
elif data['Year_of_Release'][index] == 2020:
count_2020 += 1
print("Games released in 2017:", count_2017)
print("Games released in 2018:", count_2018)
print("Games released in 2019:", count_2019)
print("Games released in 2020:", count_2020)
C:\Users\jstge\AppData\Local\Temp\ipykernel_2544\3633535417.py:7: FutureWarning: Series.bool is now deprecated and will be removed in future version of pandas
Games released in 2017: 3 Games released in 2018: 0 Games released in 2019: 0 Games released in 2020: 1
In [ ]:
years = [2017, 2018, 2019, 2020]
counts = [3, 0, 0, 1]
df = pd.DataFrame({'Year': years, 'Games Released': counts})
fig = go.Figure(data=go.Bar(
x=df['Year'],
y=df['Games Released'],
#marker_color='blue'
))
fig.update_layout(
title='Total Number of Games Released Each Year (2017 - 2020)',
xaxis_title='Year of Release',
yaxis_title='Number of Games Released'
)
fig.update_layout(width=600, height=500)
fig.show()
In [ ]:
filtered_data = data.dropna(subset=['Critic_Score', 'User_Score'])
fig = px.scatter(filtered_data, x='Critic_Score', y='User_Score', hover_data=['Name'],
title='Critic Score vs User Score')
fig.update_traces(marker=dict(size=8),
selector=dict(mode='markers'))
fig.update_layout(xaxis_title='Critic Score', yaxis_title='User Score')
fig.show()
In [ ]:
publisher_scores = data.groupby('Publisher').agg({'Critic_Score': 'mean', 'User_Score': 'mean'}).reset_index()
fig = px.scatter(publisher_scores, x='Critic_Score', y='User_Score',
title='Mean Critic Score vs Mean User Score per Publisher',
hover_data={'Publisher': True, 'Critic_Score': True, 'User_Score': True})
fig.update_traces(marker=dict(size=8),
selector=dict(mode='markers'))
fig.update_layout(xaxis_title='Mean Critic Score', yaxis_title='Mean User Score',
hovermode='closest', hoverlabel=dict(bgcolor="white", font_size=12))
fig.update_layout(width=800, height=600)
fig.show()
In [ ]:
publisher_sales = data.groupby('Publisher')['Global_Sales'].sum().reset_index()
top_publishers = publisher_sales.nlargest(10, 'Global_Sales')
fig = px.bar(top_publishers, x='Global_Sales', y='Publisher', orientation='h',
title='Top 10 Publishers by Global Sales',
labels={'Global_Sales': 'Total Global Sales', 'Publisher': 'Publisher'})
fig.update_layout(xaxis_title='Total Global Sales', yaxis_title='Publisher', yaxis_categoryorder='total ascending',
yaxis={'categoryorder': 'total ascending'}, width=800, height=350)
fig.show()
In [ ]:
publisher_games = data.groupby('Publisher')['Name'].count().reset_index()
publisher_games.columns = ['Publisher', 'Number of Games']
top_publishers_games = publisher_games.nlargest(10, 'Number of Games')
fig = px.bar(top_publishers_games, x='Number of Games', y='Publisher', orientation='h',
title='Top 10 Publishers by Number of Games Released',
labels={'Number of Games': 'Number of Games Released', 'Publisher': 'Publisher'})
for index, row in top_publishers_games.iterrows():
fig.add_annotation(x=row['Number of Games'], y=row['Publisher'], text=row['Number of Games'],
font=dict(color='white', size=12), showarrow=False, xshift=-20)
fig.update_layout(xaxis_title='Number of Games Released', yaxis_title='Publisher', yaxis_categoryorder='total ascending',
yaxis={'categoryorder': 'total ascending'}, width=800, height=350)
fig.show()
In [ ]:
publisher_sales = data.groupby('Publisher')['Global_Sales'].sum().reset_index()
top_publishers = publisher_sales.nlargest(10, 'Global_Sales')
publisher_games = data.groupby('Publisher')['Name'].count().reset_index()
publisher_games.columns = ['Publisher', 'Number of Games']
top_publishers_games = publisher_games.nlargest(10, 'Number of Games')
merged_data = pd.merge(top_publishers, top_publishers_games, on='Publisher', how='outer')
fig = go.Figure()
fig.add_trace(go.Bar(
y=merged_data['Publisher'],
x=merged_data['Global_Sales'],
name='Global Sales',
orientation='h',
marker_color='#942158'
))
fig.add_trace(go.Bar(
y=merged_data['Publisher'],
x=merged_data['Number of Games'],
name='Number of Games Released',
orientation='h',
marker_color='#d6590f'
))
fig.update_layout(
barmode='group',
title='Top 10 Publishers by Global Sales and their Number of Games Released',
xaxis_title='Value* (*- in million units for Global Sales)',
yaxis_title='Publishers',
yaxis=dict(categoryorder='total ascending'),
legend=dict(x=0.1, y=1.1, orientation='h'),
height=600,
width=1000
)
fig.show()
In [ ]:
publisher_sales = data.groupby('Publisher')['Global_Sales'].sum().reset_index()
top_publishers = publisher_sales.nlargest(10, 'Global_Sales')
publisher_games = data.groupby('Publisher')['Name'].count().reset_index()
publisher_games.columns = ['Publisher', 'Number of Games']
top_publishers_games = publisher_games.nlargest(10, 'Number of Games')
merged_data = pd.merge(top_publishers, top_publishers_games, on='Publisher', how='outer')
fig = px.scatter(merged_data, x='Number of Games', y='Global_Sales', text='Publisher',
title='Top 10 Publishers: Number of Games vs Global Sales',
labels={'Number of Games': 'Number of Games Released', 'Global_Sales': 'Total Global Sales (milion units)'})
fig.update_traces(marker=dict(size=12, color='blue'), mode='markers', textfont=dict(size=10))
fig.update_layout(width=800, height=600)
fig.show()
In [ ]:
top_10_publishers = merged_data.nlargest(10, 'Global_Sales')['Publisher']
top_10_data = data[data['Publisher'].isin(top_10_publishers)]
region_sales = top_10_data.groupby('Publisher')[['NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales']].sum().reset_index()
region_sales_melted = region_sales.melt(id_vars='Publisher', value_vars=['NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales'],
var_name='Region', value_name='Sales')
fig = px.bar(region_sales_melted, x='Publisher', y='Sales', color='Region', barmode='stack',
title='Top 10 Publishers: Sales by Region',
labels={'Sales': 'Total Sales', 'Region': 'Region'})
fig.update_layout(xaxis_title='Publisher', yaxis_title='Total Sales', legend_title='Region')
fig.update_layout(width=800, height=600)
fig.show()
In [ ]:
fig = go.Figure()
fig.add_trace(go.Box(y=data['NA_Sales'], name='NA Sales', marker_color='skyblue'))
fig.add_trace(go.Box(y=data['EU_Sales'], name='EU Sales', marker_color='salmon'))
fig.add_trace(go.Box(y=data['JP_Sales'], name='JP Sales', marker_color='lightgreen'))
fig.add_trace(go.Box(y=data['Other_Sales'], name='Other Sales', marker_color='lightcoral'))
fig.update_layout(
title='Box Plot of Sales by Region',
xaxis_title='Region',
yaxis_title='Sales',
boxmode='group'
)
fig.update_layout(width=1000, height=600)
fig.show()
In [ ]:
# I NEED TO REMOVE OUTLIERS WTF
def remove_outliers(series):
Q1 = series.quantile(0.25)
Q3 = series.quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
return series[(series >= lower_bound) & (series <= upper_bound)]
temp_data = data.copy()
temp_data['NA_Sales'] = remove_outliers(temp_data['NA_Sales'])
temp_data['EU_Sales'] = remove_outliers(temp_data['EU_Sales'])
temp_data['JP_Sales'] = remove_outliers(temp_data['JP_Sales'])
temp_data['Other_Sales'] = remove_outliers(temp_data['Other_Sales'])
In [ ]:
fig = go.Figure()
fig.add_trace(go.Box(y=temp_data['NA_Sales'], name='NA Sales', marker_color='skyblue'))
fig.add_trace(go.Box(y=temp_data['EU_Sales'], name='EU Sales', marker_color='salmon'))
fig.add_trace(go.Box(y=temp_data['JP_Sales'], name='JP Sales', marker_color='lightgreen'))
fig.add_trace(go.Box(y=temp_data['Other_Sales'], name='Other Sales', marker_color='lightcoral'))
fig.update_layout(
title='Box Plot of Sales by Region',
xaxis_title='Region',
yaxis_title='Sales',
boxmode='group'
)
fig.update_layout(width=1000, height=600)
fig.show()
In [ ]:
print(data[['NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales']].describe())
NA_Sales EU_Sales JP_Sales Other_Sales count 16719.000000 16719.000000 16719.000000 16719.000000 mean 0.263330 0.145025 0.077602 0.047332 std 0.813514 0.503283 0.308818 0.186710 min 0.000000 0.000000 0.000000 0.000000 25% 0.000000 0.000000 0.000000 0.000000 50% 0.080000 0.020000 0.000000 0.010000 75% 0.240000 0.110000 0.040000 0.030000 max 41.360000 28.960000 10.220000 10.570000
In [ ]:
fig = go.Figure(data=[go.Histogram(x=data['NA_Sales'], nbinsx=30)])
fig.update_layout(
title='Distribution of NA Sales',
xaxis_title='NA Sales',
yaxis_title='Frequency'
)
fig.show()
In [ ]:
fig = px.scatter(data, x='Global_Sales', y='Name', title='Global Sales vs. Games',
labels={'Global_Sales': 'Global Sales', 'Name': 'Games'})
fig.update_traces(marker=dict(size=2))
fig.update_yaxes(showticklabels=False)
fig.show()
In [ ]:
filtered_data = data[data['Global_Sales'] > 20]
fig = px.scatter(filtered_data, x='Name', y='Global_Sales', title='Global Sales vs. Games (Sales > 20)',
labels={'Name': 'Games', 'Global_Sales': 'Global Sales'})
fig.update_traces(marker=dict(size=2))
fig.update_xaxes(showticklabels=False)
fig.update_layout(title_x=0.5)
fig.show()
In [ ]:
bins = list(range(0, 81)) # Range from 0 to 80, inclusive
fig = go.Figure()
fig.add_trace(go.Histogram(x=data['Global_Sales'],
xbins=dict(start=0, end=80, size=0.1),
marker_color='blue'))
fig.update_layout(
title='Histogram of Global Sales',
xaxis_title='Global Sales',
yaxis_title='Count',
)
fig.show()
In [ ]:
first_appearance = data.groupby('Publisher')['Year_of_Release'].min().reset_index()
first_appearance = first_appearance.sort_values(by='Year_of_Release')
fig = px.scatter(first_appearance, x='Year_of_Release', y='Publisher',
title='First Appearance of Publishers Over Time',
labels={'Year_of_Release': 'Year', 'Publisher': 'Publisher'})
fig.update_layout(xaxis_title='Year', yaxis_title='Publisher')
fig.show()
In [ ]:
first_appearance = data.groupby('Publisher')['Year_of_Release'].min().reset_index()
global_sales = data.groupby('Publisher')['Global_Sales'].sum().reset_index()
merged_data = pd.merge(first_appearance, global_sales, on='Publisher', how='inner')
fig = px.scatter(merged_data, x='Year_of_Release', y='Global_Sales', hover_name='Publisher',
title='First Appearance of Publisher vs. Global Sales by Publisher',
labels={'Year_of_Release': 'First Appearance Year', 'Global_Sales': 'Global Sales'},
size_max=2)
fig.update_layout(xaxis_title='Year of First Appearance of a Publisher', yaxis_title='Global Sales')
fig.update_layout(width=1000, height=600)
fig.show()